Léa NOIREAUX, Amine MOUSSA, Cécile PAILHE
# essential libraries
import json
import random
from urllib.request import urlopen
# File reading
import glob
from os import name, read
from os.path import basename, splitext
# storing and anaysis
import numpy as np
import pandas as pd
import datetime as dt
# visualization
import matplotlib.pyplot as plt
#import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from plotly.offline import plot, iplot, init_notebook_mode
#import calmap
#import folium
# converter
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# hide warnings
import warnings
warnings.filterwarnings('ignore')
with open('data/departements-avec-outre-mer.geojson.txt', 'r') as json_file:
departement = json.load(json_file)
# importing datasets
def data_generator(folder_name):
filepath = glob.glob(folder_name + "/*")
for file in filepath: # for each file
filename = splitext(basename(file))[0] + ".txt" # File name
yield filename, pd.read_csv(folder_name + "/" + filename, sep="|")
def data_cleaning(data, perc_NaN):
print("Processing cleaning...")
num_cols = len(data.columns)
print("\n*" + str(data.isna().sum()))
# drop cols with "n%" of NaN and their NaN rows
print("\n* Dropping columns with more then " + str(perc_NaN) + "% of NaN")
min_count = int(((100-perc_NaN)/100)*data.shape[0] + 1)
data = data.dropna(axis=1, thresh=min_count)
print("-> " + str(num_cols - len(data.columns)) + " columns dropped!")
print("\n* Number of NaN after the first cleaning in the " + str(len(data.columns)) + " columns left:\n" + str(data.isna().sum()))
data = data.dropna()
return data
#Ajoute un 0 devant les code de département qui ont qu'un digit
def add0CodeDep(code):
if len(code)==1:
code='0'+code
return code
def convert_data(data):
# Convert columns
data["Code commune"] = data['Code commune'].apply(lambda x: str(int(x)))
data['Code departement']= data['Code departement'].apply(lambda x: str(x))
data['Code departement'] = data['Code departement'].apply(add0CodeDep)
data["Code postal"] = data["Code postal"].apply(lambda x: int(x))
data['Code postal'] = data['Code postal'].apply(lambda x: str(int(x)))
data['Valeur fonciere']=data['Valeur fonciere'].apply(lambda x: float('.'.join(x.split(','))))
data['Prix MC ST']=data['Valeur fonciere']/data['Surface terrain']
data['Prix MC ST']=data['Prix MC ST'].apply(lambda x : 0 if x== np.inf else x)
data['Prix MC SRB']=data['Valeur fonciere']/data['Surface reelle bati']
data['Prix MC SRB']=data['Prix MC SRB'].apply(lambda x : 0 if x== np.inf else x)
data['Nombre de vente']=1 #crée une colonne pour le nombre de vente remplie de 1
return data
# Create a dictionnary with clean datasets
clean_datasets = {}
for filename, data in data_generator("data/Valeurs Foncieres"):
# Extract year from filename
year = ''.join([ch for ch in filename if ch in '0123456789'])
print("\n----> Year: " + year)
# Clean data by dropping useless cols and rows / Converting values
clean_data = data_cleaning(data, perc_NaN=50)
clean_data = convert_data(clean_data)
clean_datasets[year] = clean_data
# Display data of 2020
clean_datasets["2020"]
----> Year: 2016 Processing cleaning... *Code service CH 2939478 Reference document 2939478 1 Articles CGI 2939478 2 Articles CGI 2939478 3 Articles CGI 2939478 4 Articles CGI 2939478 5 Articles CGI 2939478 No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 35643 No voie 1257084 B/T/Q 2816323 Type de voie 1292701 Code voie 23310 Voie 23338 Code postal 23494 Commune 0 Code departement 0 Code commune 0 Prefixe de section 2837406 Section 106 No plan 0 No Volume 2927451 1er lot 2050131 Surface Carrez du 1er lot 2700761 2eme lot 2747922 Surface Carrez du 2eme lot 2879371 3eme lot 2907565 Surface Carrez du 3eme lot 2933583 4eme lot 2928634 Surface Carrez du 4eme lot 2938018 5eme lot 2934306 Surface Carrez du 5eme lot 2938832 Nombre de lots 0 Code type local 1350027 Type local 1350027 Identifiant local 2939478 Surface reelle bati 1352568 Nombre pieces principales 1352568 Nature culture 901816 Nature culture speciale 2800349 Surface terrain 901816 dtype: int64 * Dropping columns with more then 50% of NaN -> 22 columns dropped! * Number of NaN after the first cleaning in the 21 columns left: No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 35643 No voie 1257084 Type de voie 1292701 Code voie 23310 Voie 23338 Code postal 23494 Commune 0 Code departement 0 Code commune 0 Section 106 No plan 0 Nombre de lots 0 Code type local 1350027 Type local 1350027 Surface reelle bati 1352568 Nombre pieces principales 1352568 Nature culture 901816 Surface terrain 901816 dtype: int64 ----> Year: 2017 Processing cleaning... *Code service CH 3382812 Reference document 3382812 1 Articles CGI 3382812 2 Articles CGI 3382812 3 Articles CGI 3382812 4 Articles CGI 3382812 5 Articles CGI 3382812 No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 45338 No voie 1395778 B/T/Q 3237253 Type de voie 1435654 Code voie 30113 Voie 30162 Code postal 30502 Commune 0 Code departement 0 Code commune 0 Prefixe de section 3238250 Section 113 No plan 0 No Volume 3373972 1er lot 2313128 Surface Carrez du 1er lot 3104314 2eme lot 3161194 Surface Carrez du 2eme lot 3314519 3eme lot 3346137 Surface Carrez du 3eme lot 3376163 4eme lot 3370210 Surface Carrez du 4eme lot 3381060 5eme lot 3376821 Surface Carrez du 5eme lot 3382060 Nombre de lots 0 Code type local 1508037 Type local 1508037 Identifiant local 3382812 Surface reelle bati 1510700 Nombre pieces principales 1510700 Nature culture 1079606 Nature culture speciale 3230429 Surface terrain 1079606 dtype: int64 * Dropping columns with more then 50% of NaN -> 22 columns dropped! * Number of NaN after the first cleaning in the 21 columns left: No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 45338 No voie 1395778 Type de voie 1435654 Code voie 30113 Voie 30162 Code postal 30502 Commune 0 Code departement 0 Code commune 0 Section 113 No plan 0 Nombre de lots 0 Code type local 1508037 Type local 1508037 Surface reelle bati 1510700 Nombre pieces principales 1510700 Nature culture 1079606 Surface terrain 1079606 dtype: int64 ----> Year: 2018 Processing cleaning... *Code service CH 3329147 Reference document 3329147 1 Articles CGI 3329147 2 Articles CGI 3329147 3 Articles CGI 3329147 4 Articles CGI 3329147 5 Articles CGI 3329147 No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 31915 No voie 1398919 B/T/Q 3184623 Type de voie 1444023 Code voie 30389 Voie 30430 Code postal 30556 Commune 0 Code departement 0 Code commune 0 Prefixe de section 3184294 Section 85 No plan 0 No Volume 3319162 1er lot 2294729 Surface Carrez du 1er lot 3039921 2eme lot 3111925 Surface Carrez du 2eme lot 3258320 3eme lot 3292644 Surface Carrez du 3eme lot 3322123 4eme lot 3316424 Surface Carrez du 4eme lot 3327276 5eme lot 3323135 Surface Carrez du 5eme lot 3328380 Nombre de lots 0 Code type local 1512780 Type local 1512780 Identifiant local 3329147 Surface reelle bati 1515599 Nombre pieces principales 1515599 Nature culture 1050173 Nature culture speciale 3175092 Surface terrain 1050173 dtype: int64 * Dropping columns with more then 50% of NaN -> 22 columns dropped! * Number of NaN after the first cleaning in the 21 columns left: No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 31915 No voie 1398919 Type de voie 1444023 Code voie 30389 Voie 30430 Code postal 30556 Commune 0 Code departement 0 Code commune 0 Section 85 No plan 0 Nombre de lots 0 Code type local 1512780 Type local 1512780 Surface reelle bati 1515599 Nombre pieces principales 1515599 Nature culture 1050173 Surface terrain 1050173 dtype: int64 ----> Year: 2019 Processing cleaning... *Code service CH 3533211 Reference document 3533211 1 Articles CGI 3533211 2 Articles CGI 3533211 3 Articles CGI 3533211 4 Articles CGI 3533211 5 Articles CGI 3533211 No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 40825 No voie 1447761 B/T/Q 3380766 Type de voie 1493221 Code voie 33771 Voie 33838 Code postal 33937 Commune 0 Code departement 0 Code commune 0 Prefixe de section 3363230 Section 104 No plan 0 No Volume 3523727 1er lot 2436238 Surface Carrez du 1er lot 3234737 2eme lot 3304923 Surface Carrez du 2eme lot 3458801 3eme lot 3495391 Surface Carrez du 3eme lot 3525820 4eme lot 3520085 Surface Carrez du 4eme lot 3531180 5eme lot 3526863 Surface Carrez du 5eme lot 3532265 Nombre de lots 0 Code type local 1632190 Type local 1632190 Identifiant local 3533211 Surface reelle bati 1634823 Nombre pieces principales 1634823 Nature culture 1112059 Nature culture speciale 3375916 Surface terrain 1112059 dtype: int64 * Dropping columns with more then 50% of NaN -> 22 columns dropped! * Number of NaN after the first cleaning in the 21 columns left: No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 40825 No voie 1447761 Type de voie 1493221 Code voie 33771 Voie 33838 Code postal 33937 Commune 0 Code departement 0 Code commune 0 Section 104 No plan 0 Nombre de lots 0 Code type local 1632190 Type local 1632190 Surface reelle bati 1634823 Nombre pieces principales 1634823 Nature culture 1112059 Surface terrain 1112059 dtype: int64 ----> Year: 2020 Processing cleaning... *Code service CH 2459560 Reference document 2459560 1 Articles CGI 2459560 2 Articles CGI 2459560 3 Articles CGI 2459560 4 Articles CGI 2459560 5 Articles CGI 2459560 No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 31752 No voie 990345 B/T/Q 2352374 Type de voie 1017945 Code voie 30948 Voie 31123 Code postal 30992 Commune 0 Code departement 0 Code commune 0 Prefixe de section 2350554 Section 101 No plan 0 No Volume 2452794 1er lot 1702571 Surface Carrez du 1er lot 2250280 2eme lot 2303817 Surface Carrez du 2eme lot 2408519 3eme lot 2434251 Surface Carrez du 3eme lot 2454703 4eme lot 2450649 Surface Carrez du 4eme lot 2458303 5eme lot 2455417 Surface Carrez du 5eme lot 2458997 Nombre de lots 0 Code type local 1143726 Type local 1143726 Identifiant local 2459560 Surface reelle bati 1145290 Nombre pieces principales 1145290 Nature culture 767675 Nature culture speciale 2352011 Surface terrain 767675 dtype: int64 * Dropping columns with more then 50% of NaN -> 22 columns dropped! * Number of NaN after the first cleaning in the 21 columns left: No disposition 0 Date mutation 0 Nature mutation 0 Valeur fonciere 31752 No voie 990345 Type de voie 1017945 Code voie 30948 Voie 31123 Code postal 30992 Commune 0 Code departement 0 Code commune 0 Section 101 No plan 0 Nombre de lots 0 Code type local 1143726 Type local 1143726 Surface reelle bati 1145290 Nombre pieces principales 1145290 Nature culture 767675 Surface terrain 767675 dtype: int64
| No disposition | Date mutation | Nature mutation | Valeur fonciere | No voie | Type de voie | Code voie | Voie | Code postal | Commune | ... | Nombre de lots | Code type local | Type local | Surface reelle bati | Nombre pieces principales | Nature culture | Surface terrain | Prix MC ST | Prix MC SRB | Nombre de vente | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | 1 | 06/01/2020 | Vente | 180300.0 | 31.0 | RUE | 0970 | COMTE DE LA TEYSSONNIERE | 1000 | BOURG-EN-BRESSE | ... | 0 | 1.0 | Maison | 75.0 | 4.0 | S | 525.0 | 343.428571 | 2404.000000 | 1 |
| 16 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 1.0 | Maison | 201.0 | 7.0 | J | 1267.0 | 276.835043 | 1745.024876 | 1 |
| 17 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 1.0 | Maison | 201.0 | 7.0 | S | 1497.0 | 234.301937 | 1745.024876 | 1 |
| 18 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | S | 1497.0 | 234.301937 | 0.000000 | 1 |
| 19 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | J | 1267.0 | 276.835043 | 0.000000 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2459554 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 2.0 | Appartement | 27.0 | 2.0 | S | 447.0 | 4334.451902 | 71759.259259 | 1 |
| 2459555 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 2.0 | Appartement | 87.0 | 4.0 | S | 447.0 | 4334.451902 | 22270.114943 | 1 |
| 2459556 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | S | 447.0 | 4334.451902 | 0.000000 | 1 |
| 2459557 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 2.0 | Appartement | 82.0 | 2.0 | S | 447.0 | 4334.451902 | 23628.048780 | 1 |
| 2459558 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | S | 447.0 | 4334.451902 | 0.000000 | 1 |
590222 rows × 24 columns
| No disposition | Date mutation | Nature mutation | Valeur fonciere | No voie | Type de voie | Code voie | Voie | Code postal | Commune | ... | Nombre de lots | Code type local | Type local | Surface reelle bati | Nombre pieces principales | Nature culture | Surface terrain | Prix MC ST | Prix MC SRB | Nombre de vente | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | 1 | 06/01/2020 | Vente | 180300.0 | 31.0 | RUE | 0970 | COMTE DE LA TEYSSONNIERE | 1000 | BOURG-EN-BRESSE | ... | 0 | 1.0 | Maison | 75.0 | 4.0 | S | 525.0 | 343.428571 | 2404.000000 | 1 |
| 16 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 1.0 | Maison | 201.0 | 7.0 | J | 1267.0 | 276.835043 | 1745.024876 | 1 |
| 17 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 1.0 | Maison | 201.0 | 7.0 | S | 1497.0 | 234.301937 | 1745.024876 | 1 |
| 18 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | S | 1497.0 | 234.301937 | 0.000000 | 1 |
| 19 | 1 | 03/01/2020 | Vente | 350750.0 | 904.0 | CHE | 0149 | DES GDES CADALLES | 1000 | SAINT-DENIS-LES-BOURG | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | J | 1267.0 | 276.835043 | 0.000000 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2459554 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 2.0 | Appartement | 27.0 | 2.0 | S | 447.0 | 4334.451902 | 71759.259259 | 1 |
| 2459555 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 2.0 | Appartement | 87.0 | 4.0 | S | 447.0 | 4334.451902 | 22270.114943 | 1 |
| 2459556 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | S | 447.0 | 4334.451902 | 0.000000 | 1 |
| 2459557 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 2.0 | Appartement | 82.0 | 2.0 | S | 447.0 | 4334.451902 | 23628.048780 | 1 |
| 2459558 | 1 | 16/12/2020 | Vente | 1937500.0 | 24.0 | RUE | 1748 | CHANOINESSE | 75004 | PARIS 04 | ... | 0 | 3.0 | Dépendance | 0.0 | 0.0 | S | 447.0 | 4334.451902 | 0.000000 | 1 |
590222 rows × 24 columns
def Number_SalesPer(data, column, year):
X=data.groupby([column])[column].count() #count the number of sale/department (each time a department appear, sale ++
txt = "Nombre de ventes par " + column + " en " + year
figure=px.bar(X, barmode='relative',color=X, color_discrete_sequence=px.colors.qualitative.G10, title=txt,height=600, width=1000, labels=[column, "Nombre de ventes"])
#color_discrete_sequence= px.colors.sequential.Plasma_r
#figure.show()
return figure
#Nombre de vente par departement
year = "2020"
clean_data = clean_datasets[year]
Number_SalesPer(clean_data, 'Code departement', year)
#Nombre de vente par ville (code postal) (Graph peut intéressant car beaucoup trop de ville)
year = "2016"
clean_data = clean_datasets[year]
Number_SalesPer(clean_data, 'Code postal', year)
def Number_SalesPer_Commune(data, departement, year):
data['Nombre de vente']=1
Y=data[(data['Code departement'] == departement)] #prend que les lignes du département concerné
X=Y.groupby(['Commune'])['Nombre de vente'].count().reset_index() #trouve le nombre de vente par commune du département
txt = "Nombre de vente par commune du " + departement + " en " + year
figure=px.treemap(X, path=['Commune'], values='Nombre de vente',color_discrete_sequence = px.colors.qualitative.Dark2, title=txt,height=600, width=1000)
figure.data[0].textinfo = 'label+value' #pour afficher le nom de la commune et le nombre de vente
#figure.show()
return figure
#Nombre de vente par commune d'un departement
year = "2019"
clean_data = clean_datasets[year]
# Departement
clean_data
Number_SalesPer_Commune(clean_data, '92', year)
def Communes_Top_Ventes(data, year, n, hover_data=[]):
data['Nombre de vente']=1
b=data.groupby(['Commune'])['Nombre de vente'].count().reset_index() #trouve le nombre de vente par commune du département
fig = px.bar(b.sort_values('Nombre de vente', ascending=True).tail(n),
x='Nombre de vente', y="Commune", color='Nombre de vente',
text='Commune', orientation='h', width=700, hover_data=hover_data,
color_discrete_sequence = px.colors.qualitative.Dark2)
fig.update_layout(title="Top des " + str(n) + " communes avec le plus de ventes en France en " + year, xaxis_title="", yaxis_title="",
yaxis_categoryorder = 'total ascending',
uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()
# Nombre de vente par commune d'un departement
year = "2020"
clean_data = clean_datasets[year]
Communes_Top_Ventes(clean_data, year, 20)
# donne des valeurs étranges (Se concentrer sur la surface de terrain et non la surface réelle bâti)
def AvgPrix_MCSurfaceBati_Dep(data, year):
X=data.groupby(['Code departement'])['Prix MC SRB'].mean()
txt = "Prix moyen du mètre carré de la surface réelle bâtie par département en " + year
figure=px.bar(X, color=X, barmode='relative',color_discrete_sequence=px.colors.qualitative.G10, title=txt,height=600, width=1000)
return figure
# Prix moyen du mètre carré de la surface réelle bâti d'un département sur une année.
year = "2019"
clean_data = clean_datasets[year]
AvgPrix_MCSurfaceBati_Dep(clean_data, year)
#les résultats paraissent plus cohérents mais certaines valeurs restent étranges (Paris)
def AvgPrice_MCTerrain_Dep(data, year):
X=data.groupby(['Code departement'])['Prix MC ST'].mean()
txt = "Prix moyen du mètre carré de la surface de terrain par département en " + year
figure=px.bar(X, color=X, barmode='relative',color_discrete_sequence=px.colors.qualitative.G10, title=txt,height=600, width=1000)
return figure
# Prix moyen du mètre carré de la surface de terrain par département
year = "2018"
clean_data = clean_datasets[year]
AvgPrice_MCTerrain_Dep(clean_data, year)
def AvgPrice_MCTerrain_Commune(data, departement, year):
Y=data[(data['Code departement'] == departement)] #prend que les lignes du département concerné
X=Y.groupby(['Commune'])['Prix MC ST'].mean()
txt = "Prix moyen du mètre carré de la surface de terrain par commune du " + departement + " en " + year
figure=px.bar(X, color=X, barmode='relative',color_discrete_sequence=px.colors.qualitative.G10, title=txt,height=600, width=1000)
return figure
# Prix moyen du mètre carré de la surface de terrain des communes d'un département
year = "2020"
clean_data = clean_datasets[year]
AvgPrice_MCTerrain_Commune(clean_data, '75', year)
def SurfaceTerrain_NatureCulture(data, year):
d = data.copy()
d= d.replace('', np.nan).fillna(0)
d.drop(d[d['Nature culture'] ==0].index, inplace = True)
txt = "Surface des terrains en fonction de la nature des cultures en " + year
fig = px.treemap(d, path=["Nature culture"], values="Surface terrain", height=500, width=1000, title=txt, color_discrete_sequence=px.colors.qualitative.G10)
fig.data[0].textinfo = 'label+text+value'
return fig
# Prix moyen du mètre carré de la surface de terrain des communes d'un département
year = "2017"
clean_data = clean_datasets[year]
SurfaceTerrain_NatureCulture(clean_data, year)
year = "2017"
clean_data = clean_datasets[year]
fig = px.pie(clean_data, names="Nature mutation", height=700, width= 1000, title="proportion des types de bien en " + year)
fig
def Repartion_TypeLocal(data, year):
X=data.groupby(by='Type local').size().reset_index(name='total')
txt = 'Répartition par type de local en ' + year
fig = px.treemap(X,title=txt, path=["Type local"], values='total', height=400, width=1000, color_discrete_sequence=['red','orange', 'gold', 'yellow'])
fig.data[0].textinfo = 'label+value'
return fig
# Répartition par type de local
year = "2018"
clean_data = clean_datasets[year]
Repartion_TypeLocal(clean_data, year)
def ValeurFonciere_NatureMutation(data, year):
df=data.copy()
df.drop(df[df['Nature mutation'] ==0].index, inplace = True)
df.drop(df[df['Valeur fonciere'] >=1500000].index, inplace = True)
df["Nature mutation"] = df["Nature mutation"].astype(str)
df["Date mutation"]=pd.to_datetime(df["Date mutation"], format="%d/%m/%Y")
df.sort_values(by='Date mutation')
fig = px.scatter(df[::100],x="Date mutation",y="Valeur fonciere",color="Nature mutation", height=600, width=1150)
return fig
# Valeur Fonciere en fonction de la nature de mutation
year = "2020"
clean_data = clean_datasets[year]
ValeurFonciere_NatureMutation(clean_data, year)
def ValeurFonciere_NbPiecesP(data, year):
df = data.copy()
df.drop(df[df['Nombre pieces principales'] ==0].index, inplace = True)
df.drop(df[df['Valeur fonciere'] >=1500000].index, inplace = True)
fig = px.scatter(df[::100],x="Valeur fonciere",y="Nombre pieces principales", color= "Valeur fonciere", height=600, width=1000)
return fig
# Valeur Fonciere en fonction de la nature de mutation
year = "2016"
clean_data = clean_datasets[year]
ValeurFonciere_NbPiecesP(clean_data, year)
no_departement_map = {}
for feature in departement["features"]:
feature["id"] = feature["properties"]["nom"]
no_departement_map[feature["properties"]["code"]] = feature["id"]
def Map_AvgPrice_MCTerrain_Dep(data, year):
X=data.groupby(['Code departement'])['Prix MC ST'].mean().reset_index()
X["id"]=X["Code departement"].apply(lambda x: no_departement_map[x])
txt = "Prix moyen du m² de surface de terrain par département en France en " + year
fig = px.choropleth(X, color='Prix MC ST', locations = 'id', geojson=departement, scope='europe', color_continuous_scale="RdGy",title=txt, height=600, width=1000)
return fig
# Prix moyen du mètre carré de la surface de terrain des communes d'un département
year = "2017"
clean_data = clean_datasets[year]
Map_AvgPrice_MCTerrain_Dep(clean_data, year)
def Map_NombreVente_dep(data, year):
X=data.groupby(['Code departement'])['Nombre de vente'].count().reset_index() #trouve le nombre de vente par département
X["id"]=X["Code departement"].apply(lambda x: no_departement_map[x])
txt = 'Nombre ventes par département en ' + year
fig = px.choropleth(X, color='Nombre de vente', locations = 'id', geojson=departement, scope='europe', color_continuous_scale="RdGy", title=txt, height=600, width= 1000)
return fig
# Carte de France et nombre de vente par département
year = "2020"
clean_data = clean_datasets[year]
Map_NombreVente_dep(clean_data, year)
# Boite à moustache sur la valeur fonciere des départements de France sur une année
def Bar_ValeurFonciere_dep(data, year):
txt = 'Distribution de la valeur fonciere dans les départements de France en ' + year
figure= px.box(data, x='Code departement', y='Valeur fonciere', points=False,color='Code departement', title=txt, height=600, width=1000)
return figure
# Valeur fonciere par département et par année
year = "2020"
clean_data = clean_datasets[year]
Bar_ValeurFonciere_dep(clean_data, year)
# Boite à moustache sur la valeur fonciere d'un département en particulier sur une année
def BoxP_ValeurFonciere_dep(data, year, departement):
X=data[(data['Code departement'] == departement)]
txt = 'Distribution de la valeur fonciere dans le ' + departement + " en " + year
figure= px.box(X, x='Code departement', y='Valeur fonciere', points=False,color='Code departement',title=txt, height=600, width=1000)
return figure
# Valeur fonciere par département et par année
year = "2017"
clean_data = clean_datasets[year]
BoxP_ValeurFonciere_dep(clean_data, year, "90")
#Boite à moustache sur la valeur fonciere d'une commune en particulier sur une année
def BoxP_ValeurFonciere_commune(data, year, commune):
X=data[(data['Commune'] == commune)]
txt = "Distribution de la valeur fonciere à " + commune + " de France en " + year
figure= px.box(X, x='Commune', y='Valeur fonciere', points=False,color='Commune', title=txt, height=600, width=1000)
return figure
# Valeur fonciere par département et par année
year = "2017"
clean_data = clean_datasets[year]
BoxP_ValeurFonciere_commune(clean_data, year, 'RUEIL-MALMAISON')
def Moustache_NbrPiece_SRBatie(data, year):
df=data.copy()[::100]
df.drop(df[df['Nombre pieces principales'] ==0].index, inplace = True)
df.drop(df[df['Nombre pieces principales'] >10].index, inplace = True)
txt = "Nombre de piece principales par surface reelle batie en " + year
fig = px.box(df, x="Nombre pieces principales", y="Surface reelle bati", title=txt, height=600, width=1000)
return fig
# Nombre de piece principales par surface reelle batie
year = "2020"
clean_data = clean_datasets[year]
Moustache_NbrPiece_SRBatie(clean_data, year)
dataCovid = pd.read_csv("data/covid data/worldometer_data.csv", sep=";")
dataCovid.dtypes
dataCovid.Date = dataCovid.Date.apply(lambda x: dt.datetime.strptime(x, '%d/%m/%Y'))
dataCovid.dtypes
Map_Covid(dataCovid)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-77-9c14f197c3e1> in <module> 1 dataCovid = pd.read_csv("data/covid data/worldometer_data.csv", sep=";") 2 dataCovid.dtypes ----> 3 dataCovid.Date = dataCovid.Date.apply(lambda x: dt.datetime.strptime(x, '%d/%m/%Y')) 4 dataCovid.dtypes 5 Map_Covid(dataCovid) C:\Python39\lib\site-packages\pandas\core\generic.py in __getattr__(self, name) 5463 if self._info_axis._can_hold_identifiers_and_holds_name(name): 5464 return self[name] -> 5465 return object.__getattribute__(self, name) 5466 5467 def __setattr__(self, name: str, value) -> None: AttributeError: 'DataFrame' object has no attribute 'Date'
## *19) Cas covid en Fonction des transactions* ##
def NewCases_Transaction_TimeSeries(data2020, dataCovid):
df = data2020.copy()
df['Nombre de Transactions']=1
df["Date mutation"]=pd.to_datetime(df["Date mutation"], format="%d/%m/%Y")
df
X=df.groupby(['Date mutation'])['Nombre de Transactions'].count().reset_index()
fig = px.line(df,x="Date mutation",y="Nombre de Transactions", height=600, width=1000)
#fig.add_hline(x=fruits, y=[2,1,3], name="Last year")
return fig
# Prepare Covid data
dataCovid = pd.read_csv("data/covid data/dataCovid.csv", sep=";")
clean_dataCovid = data_cleaning(dataCovid, perc_NaN=20)
Processing cleaning... *Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries 0 dtype: int64 * Dropping columns with more then 20% of NaN -> 0 columns dropped! * Number of NaN after the first cleaning in the 1 columns left: Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries 0 dtype: int64
# Cas covid en Fonction des transactions
clean_data = clean_datasets["2020"]
NewCases_Transaction_TimeSeries(clean_data, clean_dataCovid)
dataCovid = pd.read_csv("data/covid data/dataCovid.csv", sep=";")
clean_dataCovid = data_cleaning(dataCovid, perc_NaN=20)
clean_dataCovid
Processing cleaning... *Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries 0 dtype: int64 * Dropping columns with more then 20% of NaN -> 0 columns dropped! * Number of NaN after the first cleaning in the 1 columns left: Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries 0 dtype: int64
| Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries | |
|---|---|
| 0 | 2020-01-22,555,17,28,510,0,0,0,3.06,5.05,60.71,6 |
| 1 | 2020-01-23,654,18,30,606,99,1,2,2.75,4.59,60.0,8 |
| 2 | 2020-01-24,941,26,36,879,287,8,6,2.76,3.83,72.... |
| 3 | 2020-01-25,1434,42,39,1353,493,16,3,2.93,2.72,... |
| 4 | 2020-01-26,2118,56,52,2010,684,14,13,2.64,2.46... |
| ... | ... |
| 183 | 2020-07-23,15510481,633506,8710969,6166006,282... |
| 184 | 2020-07-24,15791645,639650,8939705,6212290,281... |
| 185 | 2020-07-25,16047190,644517,9158743,6243930,255... |
| 186 | 2020-07-26,16251796,648621,9293464,6309711,204... |
| 187 | 2020-07-27,16480485,654036,9468087,6358362,228... |
188 rows × 1 columns
## 20)
def CasCovid_2020(data, year):
df = data.copy()
df["Date mutation"]=pd.to_datetime(df["Date mutation"], format="%d/%m/%Y")
df.drop(df[df['Date mutation'].dt.month == 8].index, inplace = True)
df.drop(df[df['Date mutation'].dt.month == 9].index, inplace = True)
df.drop(df[df['Date mutation'].dt.month == 10].index, inplace = True)
df.drop(df[df['Date mutation'].dt.month == 11].index, inplace = True)
df.drop(df[df['Date mutation'].dt.month == 12].index, inplace = True)
c= list(df["Date mutation"].value_counts())
df['New cases']=clean_dataCovid['New cases']
df['Date']=clean_dataCovid['Date']
df["Date"]=pd.to_datetime(df["Date"], format="%d/%m/%Y")
dbis = pd.DataFrame({'Date': df['Date'],'New cases':df['New cases'],'Date mutation':df['Date mutation'],'total':df['total']})
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=dbis['Date'], y=dbis['New cases'], name="nouveau cas de covid"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=dbis['Date'], y=c, name="nombre de vente"),
secondary_y=True,
)
fig.update_layout(title_text="title")
fig.update_xaxes(title_text="date")
fig.update_yaxes(title_text=" Nouveau cas covid", secondary_y=False)
fig.update_yaxes(title_text=" nombre de transactions immobilières", secondary_y=True)
return fig
# Cas covid en 2020
year = "2020"
clean_data = clean_datasets[year]
CasCovid_2020(data, year)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) C:\Python39\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 3079 try: -> 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'New cases' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-84-ea9ff3f7063f> in <module> 2 year = "2020" 3 clean_data = clean_datasets[year] ----> 4 CasCovid_2020(data, year) <ipython-input-83-7a414a789230> in CasCovid_2020(data, year) 10 c= list(df["Date mutation"].value_counts()) 11 ---> 12 df['New cases']=clean_dataCovid['New cases'] 13 df['Date']=clean_dataCovid['Date'] 14 df["Date"]=pd.to_datetime(df["Date"], format="%d/%m/%Y") C:\Python39\lib\site-packages\pandas\core\frame.py in __getitem__(self, key) 3022 if self.columns.nlevels > 1: 3023 return self._getitem_multilevel(key) -> 3024 indexer = self.columns.get_loc(key) 3025 if is_integer(indexer): 3026 indexer = [indexer] C:\Python39\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: -> 3082 raise KeyError(key) from err 3083 3084 if tolerance is not None: KeyError: 'New cases'
def NombreVentes_PerYear(dataset):
X=pd.DataFrame({'Nombre de vente': [len(dataset["2016"]),
len(dataset["2017"]),
len(dataset["2018"]),
len(dataset["2019"]),
len(dataset["2020"])],
'Annee':['2016', '2017', '2018', '2019', '2020']})
figure=X.plot.bar(x='Annee',rot=0, title="Nombre de vente par année")
return figure
# Nombre de vente par année
NombreVentes_PerYear(clean_datasets)
<matplotlib.axes._subplots.AxesSubplot at 0x28012086848>
def NbrVentesLocal_TimeSeries(dataset, type_local):
a=len(dataset["2016"][(dataset["2016"]['Type local'] == type_local)])
b=len(dataset["2017"][(dataset["2017"]['Type local'] == type_local)])
c=len(dataset["2018"][(dataset["2018"]['Type local'] == type_local)])
d=len(dataset["2019"][(dataset["2019"]['Type local'] == type_local)])
e=len(dataset["2020"][(dataset["2020"]['Type local'] == type_local)])
txt = "Nombre de vente de " + type_local + " par année"
b=pd.DataFrame({txt: [a,b,c,d,e],'Annee':['2016', '2017', '2018', '2019', '2020']})
figure=b.plot.bar(x='Annee',rot=0, title=txt)
return figure
# Nombre de vente d'appartement
NbrVentesLocal_TimeSeries(clean_datasets, "Appartement")
NbrVentesLocal_TimeSeries(clean_datasets, "Maison")
NbrVentesLocal_TimeSeries(clean_datasets, "Dépendance")
<matplotlib.axes._subplots.AxesSubplot at 0x28014c637c8>
def AvgPrice_MC_Dep(dataset, departement):
a=dataset["2016"].groupby(['Code departement'])['Prix MC ST'].mean()
b=dataset["2017"].groupby(['Code departement'])['Prix MC ST'].mean()
c=dataset["2018"].groupby(['Code departement'])['Prix MC ST'].mean()
d=dataset["2019"].groupby(['Code departement'])['Prix MC ST'].mean()
e=dataset["2020"].groupby(['Code departement'])['Prix MC ST'].mean()
txt = "Prix moyen du MC dans le " + departement + " sur les 5 ans"
b=pd.DataFrame({txt: [a[departement],b[departement],c[departement],d[departement],e[departement]],'Annee':['2016', '2017', '2018', '2019', '2020']})
figure=b.plot.bar(x='Annee',rot=0, title=txt)
return figure
# Prix du MC dans un département sur les 5 ans
AvgPrice_MC_Dep(clean_datasets, "01")
<matplotlib.axes._subplots.AxesSubplot at 0x28012d93ac8>
def AvgPrice_MC_Commune(dataset, commune):
a=dataset["2016"].groupby(['Commune'])['Prix MC ST'].mean()
b=dataset["2017"].groupby(['Commune'])['Prix MC ST'].mean()
c=dataset["2018"].groupby(['Commune'])['Prix MC ST'].mean()
d=dataset["2019"].groupby(['Commune'])['Prix MC ST'].mean()
e=dataset["2020"].groupby(['Commune'])['Prix MC ST'].mean()
txt = "Prix moyen MC à " + commune + " de 2016 à 2020"
X=pd.DataFrame({txt: [a[commune],b[commune],c[commune],d[commune],e[commune]],'Annee':['2016', '2017', '2018', '2019', '2020']})
figure=X.plot.bar(x='Annee',rot=0, title=txt)
return figure
# Prix moyen du MC dans une commune sur 5 ans
AvgPrice_MC_Commune(clean_datasets, 'AMIENS')
<matplotlib.axes._subplots.AxesSubplot at 0x2800d1b50c8>
def Avg_ValeurFonciere_NatureMutation(data_year_1, data_year_2, data_year_3):
moy1 = list(data_year_1.groupby("Nature mutation")["Valeur fonciere"].mean())
moy2 = list(data_year_2.groupby("Nature mutation")["Valeur fonciere"].mean())
moy3 = list(data_year_3.groupby("Nature mutation")["Valeur fonciere"].mean())
moy = moy1+moy2+moy3
gr = ["2016"]*6+["2018"]*6+["2020"]*6
nm = list(data_year_1['Nature mutation'].unique())*3
df = pd.DataFrame({'nature mutation':nm ,'valeur fonciere moyenne':moy,'annee':gr })
fig = px.bar(df,x="valeur fonciere moyenne", y="nature mutation",text=moy, orientation='h',color="annee",barmode='group', height=600, width=1000)
return fig
# Nature de mutation en fct de la valeur fonciere moyenne
year_1, year_2, year_3 = "2016", "2018", "2020"
clean_data_1 = clean_datasets[year_1]
clean_data_2 = clean_datasets[year_2]
clean_data_3 = clean_datasets[year_3]
Avg_ValeurFonciere_NatureMutation(clean_data_1, clean_data_2, clean_data_3)
def Mutations_TimeSeries(data_year_1, data_year_2):
d1 = data_year_1.copy()
d2 = data_year_2.copy()
d1.drop(d1[d1['Date mutation'] == 0].index, inplace = True)
d2.drop(d2[d2['Date mutation'] == 0].index, inplace = True)
d1['Date mutation']=pd.to_datetime(d1['Date mutation'], format='%d/%m/%Y')
d2['Date mutation']=pd.to_datetime(d2['Date mutation'], format='%d/%m/%Y')
d1['month'] = d1['Date mutation'].dt.strftime('%m')
d2['month'] = d2['Date mutation'].dt.strftime('%m')
count_1 = list(d1["month"].value_counts())
count_2 = list(d2["month"].value_counts())
c = count_1 + count_2
ans = ["2020"]*12+["2016"]*12
mois = list(d1['month'].unique())*2
txt = "Evolution des mutations pendant l'année entre 2016 et 2020"
df = pd.DataFrame({'month':mois ,'count':c,'annee':ans })
fig = px.bar(df,x="month", y="count",text=c,color="annee",barmode='group', width=1000, height=600, title=txt)
return fig
# Nature de mutation en fct de la valeur fonciere moyenne
year_1, year_2 = "2016", "2020"
clean_data_1 = clean_datasets[year_1]
clean_data_2 = clean_datasets[year_2]
Mutations_TimeSeries(clean_data_1, clean_data_2)
import plotly
plotly.offline.init_notebook_mode()